# Immigration and the top 1%

# Function that calculates summary statistics for individuals in the top income share


# input: tax year

# definition
# table_top_share: build the summary-statistics table for one tax year.
#
# Argument:
#   y  tax year (a single value); used to build the input file name
#      'datalab_data_folder/SA_PAYE<y>.fst' and stored in the `tax_year`
#      column of both the working data and the returned table.
#
# Returns a tibble that stacks the output of sumstat_w() for four groups,
# identified by the `group` column:
#   'all'                     every row returned by top_all()
#   'dummy_on'                top rows where the migrant dummy equals 1
#   'dummy_off_noreplacem'    top rows where the migrant dummy equals 0
#   'dummy_off_withreplacem'  the previous group plus the rows returned by
#                             brits_replacement()
# Every group carries `topshare_count` and `migrants_count`; extra memo
# columns are filled for 'all' and 'dummy_on' only (NA for the other groups
# after bind_rows()).
#
# Relies on objects that are NOT defined in this function and must exist in
# the calling environment: ts, age_limit, which_sample, rank_var,
# migrant_dummy_name, and the helpers how_many_top(), top_all(),
# brits_replacement() and sumstat_w().
table_top_share <- function(y) {

  stopifnot(length(y) == 1L)

  # number of individuals in top share
  count_top <- how_many_top(y, ts)

  # file name (fst file)
  file_name <- paste0('datalab_data_folder/SA_PAYE', y, '.fst')

  # read only the first rows of the file: size of top share + buffer used
  # for the replacement step
  # NOTE(review): this presumably relies on the fst file being stored in
  # descending income order -- confirm against the code that writes it
  data <- fst::read_fst(path = file_name, to = count_top + 150000)
  data <- as_tibble(data)

  # set all variables as numeric except NINO, ID, scheme ref, sex
  # NOTE(review): the pattern is unanchored, so 'id' also excludes any column
  # whose name merely contains "id" -- confirm this is intended
  ind <- grep('nino_anon|id|schemeref_anon|sex', names(data), invert = TRUE)
  data <- data %>%
    mutate_at(.vars = ind, .funs = as.numeric)

  # recode missing weights to zero, then add the tax year and the post-tax
  # income variable
  data <- data %>%
    mutate(weight = if_else(is.na(weight), 0, weight),
           tax_year = y,
           ti_posttax = ti - it_tot)

  # subset data (keep adults); `age_limit` is resolved by subset() either as
  # a column of `data` or as an object in the calling environment
  data <- subset(data, age_limit)

  # keep SA sample
  if (which_sample == 'SA_only') {
    data <- data %>%
      filter(sa_record == 1)
  }

  # sort by the ranking variable (highest first) and build the weighted rank
  # NOTE(review): the original code recoded NA weights to 1 at this point,
  # but that could never trigger because NAs are already set to 0 above; the
  # dead statement was removed. If NA -> 1 was the intended rule, change the
  # recode above instead.
  data <- data %>%
    arrange(desc(!!rank_var)) %>%
    mutate(w_rank = cumsum(weight))


  # # # #

  # 1. all individuals in top (based on weighted ranking)
  top_all_df <- top_all(data, count_top)

  # 1a. individuals outside top (complement of #1)
  # assumes top_all() returns the first `ntop` rows of the sorted data.
  # A logical mask is used instead of (ntop + 1):nrow(data) because the colon
  # form counts backwards when ntop == nrow(data) and would pick wrong rows.
  ntop <- nrow(top_all_df)
  notop_df <- data[seq_len(nrow(data)) > ntop, ]

  # 2. migrants in top, with a weighted ranking computed among migrants only
  top_migrants_df <- top_all_df %>%
    filter(!!migrant_dummy_name == 1) %>%
    mutate(w_rank2 = cumsum(weight))

  # calculate how many migrants (weighted); zero when there are none, since
  # max() of an empty vector would return -Inf with a warning
  n_migrant <- if (nrow(top_migrants_df) > 0) {
    floor(max(top_migrants_df$w_rank2))
  } else {
    0
  }

  # 3. Top share made by British people WITHOUT replacement
  brits_NoReplace <- top_all_df %>%
    filter(!!migrant_dummy_name == 0)

  # brits replacement of migrants
  brits_replacement_df <- brits_replacement(notop_df, n_migrant, migrant_dummy_name)

  # 4. Top share made by British people WITH replacement
  brits_WithReplace <- bind_rows(brits_NoReplace, brits_replacement_df)


  # # # #

  ## summary stats, each tagged with its group label and the shared counts

  # all
  x1 <- sumstat_w(top_all_df) %>%
    mutate(group = 'all',
           topshare_count = count_top,
           migrants_count = n_migrant)

  # migrants in top share
  x2 <- sumstat_w(top_migrants_df) %>%
    mutate(group = 'dummy_on',
           topshare_count = count_top,
           migrants_count = n_migrant)

  # British only Top share, without replacement
  x3 <- sumstat_w(brits_NoReplace) %>%
    mutate(group = 'dummy_off_noreplacem',
           topshare_count = count_top,
           migrants_count = n_migrant)

  # British only Top share, with replacement
  x4 <- sumstat_w(brits_WithReplace) %>%
    mutate(group = 'dummy_off_withreplacem',
           topshare_count = count_top,
           migrants_count = n_migrant)


  # # # #

  # memo counts
  x1 <- x1 %>%
    mutate(top_all_df_n = nrow(top_all_df),
           max_weight = max(top_all_df$weight),
           weighted_count = max(top_all_df$w_rank),
           paye_matched = sum(top_all_df$paye_record == 1 & top_all_df$sa_record == 1),
           paye_non_matched = sum(top_all_df$paye_record == 1 & top_all_df$sa_record == 0),
           prop = migrants_count/topshare_count)

  # memo counts for migrants
  x2 <- x2 %>%
    mutate(paye_matched = sum(top_migrants_df$paye_record == 1 & top_migrants_df$sa_record == 1),
           paye_non_matched = sum(top_migrants_df$paye_record == 1 & top_migrants_df$sa_record == 0))

  # bind the four groups together and add the tax year
  bind_rows(x1, x2, x3, x4) %>%
    as_tibble() %>%
    mutate(tax_year = y)

}


